ephrin_binding.ipynb¶

Analyze effect of RBP mutations on receptor binding from DMS selection data using soluble bat Ephrin-B2 or -B3

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
altair_config = None
nipah_config = None

# input files
binding_E2_file = None
binding_E3_file = None

# output images
entry_binding_combined_corr_plot = None
E2_E3_correlation = None
E2_E3_correlation_site = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_region_bubble_plot = None
combined_contact_ranked_bar_output = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
    "results/images/entry_binding_combined_corr_plot.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
combined_contact_ranked_bar_output = (
    "results/images/combined_contact_ranked_bar_output.html"
)

Import libraries¶

In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml

Set working directory¶

In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if (
    os.getcwd()
    == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
    pass
    print("Already in correct directory")
else:
    os.chdir(
        "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
    )
    print("Setup in correct directory")
Setup in correct directory

Set hard paths for running in interactive mode¶

In [5]:
if nipah_config is None:
    ##hard paths in case don't want to run with snakemake
    print("loading hard paths")
    altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"

    # input files
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"

Run config files to setup altair theme and config variables¶

In [6]:
if altair_config:
    with open(altair_config, "r") as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import and merge data¶

Import the filtered binding and entry data for bEFNB2 and bEFNB3¶

In [7]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q E 0.1659 0.4309 3.5 -1.199 0.3996 5.250 1.0 negative
1 71 Q F -0.3429 0.5299 3.0 -0.947 0.6969 4.625 1.0 aromatic
2 71 Q G 0.4657 0.7042 4.0 -1.374 0.5796 7.750 1.0 special
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q C -0.1332 0.1525 3.0 -0.7227 0.7828 3.000 1.0 special
1 71 Q D -0.2937 0.3305 4.0 -0.3884 0.6369 3.429 1.0 negative
2 71 Q E -0.3527 0.1762 4.0 -0.2482 0.9791 4.571 1.0 negative

Merge data¶

In [8]:
## Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=["site", "wildtype", "mutant"],
    suffixes=["_E2", "_E3"],
    how="outer",
)
display(df_binding_effect_merge.head(3))
## Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "bEFNB2"
df_E3_filter["selection"] = "bEFNB3"

## Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
display(df_binding_effect_concat.head(3))
site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
0 71 Q E 0.1659 0.4309 3.5 -1.199 0.3996 5.250 1.0 negative -0.35270 0.17620 4.0 -0.2482 0.9791 4.571 1.0 negative
1 71 Q F -0.3429 0.5299 3.0 -0.947 0.6969 4.625 1.0 aromatic -0.03982 0.12830 3.0 -0.4973 0.3080 3.286 1.0 aromatic
2 71 Q G 0.4657 0.7042 4.0 -1.374 0.5796 7.750 1.0 special -0.04107 0.05842 4.0 -1.3310 0.8316 4.714 1.0 special
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
0 71 Q E 0.1659 0.4309 3.5 -1.199 0.3996 5.250 1.0 negative bEFNB2
1 71 Q F -0.3429 0.5299 3.0 -0.947 0.6969 4.625 1.0 aromatic bEFNB2
2 71 Q G 0.4657 0.7042 4.0 -1.374 0.5796 7.750 1.0 special bEFNB2

Calculate stats¶

In [9]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
    print(f"We are analyzing {name}\n")
    print(f"The total number of mutants was: {df.shape[0]}\n")
    tmp_df = df.sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected:")
    display(tmp_df.head(5))

    tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
    print("\nThese are the highest binding mutants detected:\n")
    display(tmp_df_high.head(5))

    # What about mutants with positive entry scores?
    tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected with positive entry scores:")
    display(tmp_df.head(5))

    tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
    print(
        "\nThese are the highest binding mutants detected with positive entry scores:\n"
    )
    display(tmp_df_high.head(5))

    mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
    print('These are the sites with the highest summed binding score:\n')
    display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))


find_highest_lowest(df_E2_filter, "bEFNB2")
find_highest_lowest(df_E3_filter, "bEFNB3")
We are analyzing bEFNB2

The total number of mutants was: 6632

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5330 501 E K -5.280 0.9284 24.50 -0.4975 0.7581 31.62 1.0 positive bEFNB2
5709 532 A V -5.259 1.1400 16.25 -0.2039 0.4129 17.38 1.0 hydrophobic bEFNB2
1744 236 R H -5.054 0.8828 22.00 -1.0930 0.6155 28.62 1.0 positive bEFNB2
5200 490 Q K -4.966 1.0810 8.50 -0.5442 0.7343 10.62 1.0 positive bEFNB2
5198 490 Q H -4.752 0.8619 8.25 -0.3961 0.7356 10.50 1.0 positive bEFNB2
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6331 580 I S 2.152 1.7680 4.75 -1.2740 0.7972 6.375 1.0 hydrophilic bEFNB2
2934 331 N E 2.144 1.2330 3.00 -1.0940 0.9094 3.500 1.0 negative bEFNB2
6434 586 N T 1.962 0.5779 5.25 -1.0610 0.9166 7.125 1.0 hydrophilic bEFNB2
1433 211 G F 1.957 0.5488 4.75 -0.9581 0.6185 5.125 1.0 aromatic bEFNB2
6007 553 S W 1.945 0.2867 10.00 -0.2906 0.4253 13.250 1.0 aromatic bEFNB2
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6068 558 A V -4.647 0.7295 17.25 0.21780 0.4495 17.620 1.0 hydrophobic bEFNB2
6056 557 N D -4.627 0.3989 9.50 0.41380 0.5006 10.750 1.0 negative bEFNB2
5704 532 A L -4.577 1.3510 6.00 0.07806 0.4481 7.625 1.0 hydrophobic bEFNB2
5617 526 L H -4.546 0.7822 5.25 0.21200 0.4088 6.625 1.0 positive bEFNB2
6349 581 Y W -4.519 1.0130 10.75 0.14230 0.4792 11.380 1.0 aromatic bEFNB2
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
2234 282 C S 1.609 1.0560 4.50 0.38130 0.3526 3.571 0.5 hydrophilic bEFNB2
4683 450 Q I 1.457 1.1940 5.00 0.12350 0.3928 6.000 1.0 hydrophobic bEFNB2
777 164 N S 1.297 1.1360 3.75 0.24060 0.2909 6.000 1.0 hydrophilic bEFNB2
6140 564 N K 1.259 0.4885 7.25 0.07777 0.4298 8.000 1.0 positive bEFNB2
3591 376 K F 1.208 1.7770 3.25 0.13220 0.2972 2.500 1.0 aromatic bEFNB2
These are the sites with the highest summed binding score:

site binding_mean
470 564 15.086900
94 172 13.883700
490 586 13.369440
222 306 12.357770
247 331 11.730800
87 165 8.624400
236 320 8.574200
198 282 8.447490
206 290 8.318651
331 419 8.301810
We are analyzing bEFNB3

The total number of mutants was: 6503

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6013 555 D K -2.147 1.2250 6.0 0.08504 0.4221 6.714 1.0 positive bEFNB3
6016 555 D R -1.510 0.4205 4.0 0.07591 0.5260 4.143 1.0 positive bEFNB3
5699 530 Q L -1.447 0.4772 3.5 -1.01900 0.8623 5.857 1.0 hydrophobic bEFNB3
6301 583 T R -1.410 0.4465 5.5 -1.00500 0.4854 6.286 1.0 positive bEFNB3
6298 583 T K -1.247 0.1132 5.0 -0.72780 0.6441 7.000 1.0 positive bEFNB3
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6347 589 R G 2.006 0.65910 7.0 -0.9655 0.5763 8.286 1.0 special bEFNB3
6267 580 I M 1.337 0.53380 5.5 -0.8155 0.5307 6.714 1.0 hydrophobic bEFNB3
2298 270 V Q 1.329 1.02800 5.0 -0.8846 0.6509 5.429 1.0 hydrophilic bEFNB3
2251 267 M Q 1.291 1.37100 4.5 -0.7981 0.8175 5.667 1.0 hydrophilic bEFNB3
5352 492 Q L 1.261 0.06666 5.5 0.4833 0.1827 5.143 1.0 hydrophobic bEFNB3
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6013 555 D K -2.1470 1.2250 6.0 0.08504 0.42210 6.714 1.0 positive bEFNB3
6016 555 D R -1.5100 0.4205 4.0 0.07591 0.52600 4.143 1.0 positive bEFNB3
5702 530 Q W -0.9974 0.5473 3.0 0.01818 0.66470 3.286 1.0 aromatic bEFNB3
6007 555 D A -0.8295 0.1999 3.5 0.37550 0.23450 3.714 1.0 hydrophobic bEFNB3
4751 441 P V -0.7461 0.1815 5.5 0.37080 0.08612 5.286 1.0 hydrophobic bEFNB3
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5352 492 Q L 1.2610 0.06666 5.5 0.48330 0.1827 5.143 1.0 hydrophobic bEFNB3
1665 211 G F 1.1690 0.28900 5.0 0.16370 0.5716 5.714 1.0 aromatic bEFNB3
6429 598 P G 1.1540 1.55900 4.5 0.23870 0.5146 5.429 1.0 special bEFNB3
5987 553 S W 0.9943 0.43520 9.5 0.16690 0.4060 9.857 1.0 aromatic bEFNB3
6290 582 D W 0.9761 0.33840 6.5 0.06342 0.3096 7.143 1.0 aromatic bEFNB3
These are the sites with the highest summed binding score:

site binding_mean
489 589 11.707770
86 161 7.332800
468 564 6.721080
224 306 6.492740
59 132 6.168804
241 323 6.101950
469 566 5.968996
260 342 5.728920
128 204 5.653570
174 254 5.152500
In [10]:
def overall_stats(df, effect, name):
    # Now group sites and find sites where all mutants are deleterious
    filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
    # Which sites are these?
    unique = filtered_df["site"].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)

    # Find the common elements that are also contact sites
    unique_contact_bool = unique_series.isin(config["contact_sites"])
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    print(f"The dataset we are analyzing is: {name}\n")

    # Print the common elements
    print(
        f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
    )

    print(f"There are {len(unique)} sites with all negative binding score mutants\n")
    print(
        f"These are the sites with all negative binding score mutants: {list(unique)}\n"
    )

    # Now find sites with low and high binding (median)
    median_df = (
        df.groupby("site")["binding_mean"]
        .max()
        .reset_index()
        .sort_values(by="binding_mean", ascending=False)
    )
    print("These are the sites with the highest binding mutants:\n")
    display(median_df.head(5))

    # Now calculate mutant number
    total_mutants = df.shape[0]

    mutants_above_cutoff_tolerated = df[df["effect"] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
        ["site", "effect", "binding_mean", "wildtype", "mutant"]
    ]

    total_sites = df["site"].unique().shape[0]

    print(f"The total number of sites are: {total_sites}")


overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2

Here are the contact sites that only have negative binding scores: [239, 242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588]

There are 43 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 239, 242, 243, 248, 346, 351, 352, 389, 390, 399, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590]

These are the sites with the highest binding mutants:

site binding_mean
484 580 2.152
247 331 2.144
490 586 1.962
133 211 1.957
459 553 1.945
The total number of sites are: 507
The dataset we are analyzing is: EFNB3

Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532]

There are 16 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [108, 140, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584]

These are the sites with the highest binding mutants:

site binding_mean
489 589 2.006
481 580 1.337
188 270 1.329
185 267 1.291
404 492 1.261
The total number of sites are: 503

Find sites with opposite effects on binding¶

In [11]:
# find sites that are different
def find_biggest_differences(df):
    efnb2_good_efnb3_bad = df[
        (df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
    ].sort_values(by="binding_mean_E2", ascending=False)
    print('mutantions good for efnb2 binding, bad for efnb3 binding:\n')
    display(efnb2_good_efnb3_bad)

    efnb2_bad_efnb3_good = df[
        (df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
    ].sort_values(by="binding_mean_E3", ascending=False)
    print('mutantions bad for efnb2 binding, good for efnb3 binding:\n')
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_binding_effect_merge)
mutantions good for efnb2 binding, bad for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
5890 546 L H 1.5230 1.1450 5.25 -1.3690 0.4832 6.500 1.0 positive -0.5431 0.6654 3.5 -0.85080 0.5440 5.143 1.0 positive
2512 303 P C 1.3450 1.5770 5.00 -0.7173 0.5901 4.875 1.0 special -0.5659 0.0975 4.0 -0.69680 0.5942 4.857 1.0 special
109 79 N S 1.2920 1.6520 3.00 -0.6820 0.5704 2.375 1.0 hydrophilic -0.6464 0.2595 3.0 -0.12940 0.5688 3.429 1.0 hydrophilic
89 78 D I 0.6586 0.6340 5.25 -0.9271 0.4006 6.500 1.0 hydrophobic -0.5665 0.7552 5.0 -0.51670 0.8447 5.143 1.0 hydrophobic
1612 224 M Q 0.6473 0.8796 3.50 0.1696 0.3110 2.625 1.0 hydrophilic -0.5806 0.3813 3.5 0.20550 0.2920 4.286 1.0 hydrophilic
5552 520 I G 0.5914 1.0130 5.00 -1.3920 0.3251 7.875 1.0 special -0.5305 0.1476 5.5 -1.30900 0.3187 7.000 1.0 special
980 178 V T 0.5288 0.7257 3.75 -0.4327 0.8737 4.625 1.0 hydrophilic -0.5130 0.5995 3.0 -0.01921 0.4254 3.571 1.0 hydrophilic
mutantions bad for efnb2 binding, good for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
6465 588 I P -2.3490 0.8405 5.25 -0.3078 0.2743 4.875 1.0 special 1.1830 0.255300 5.5 -0.6131 0.3819 5.429 1.0 special
3054 338 R G -0.5172 0.5068 4.50 0.3072 0.4297 5.250 1.0 special 0.6836 0.975700 4.0 0.1580 0.3395 3.429 1.0 special
5208 491 S A -0.9317 0.5371 5.50 -0.9330 0.8352 6.500 1.0 hydrophobic 0.6451 0.009846 6.5 0.1429 0.3582 6.143 1.0 hydrophobic
5235 492 Q R -2.8230 0.6111 19.25 -0.1082 0.2778 23.000 1.0 positive 0.6437 0.053880 20.0 0.5095 0.1192 20.570 1.0 positive

Find the top overlapping binders for both bEFNB2 and bEFNB3¶

In [12]:
def find_good_binding_for_both(df):
    e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
    e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
    e2_good = e2_good.head(50)
    e3_good = e3_good.head(50)
    combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
    display(combo)

find_good_binding_for_both(df_binding_effect_merge)
site wildtype mutant binding_mean_E2_x binding_std_E2_x times_seen_binding_E2_x effect_E2_x effect_std_E2_x times_seen_cell_entry_E2_x frac_models_E2_x ... frac_models_E2_y mutant_type_E2_y binding_mean_E3_y binding_std_E3_y times_seen_binding_E3_y effect_E3_y effect_std_E3_y times_seen_cell_entry_E3_y frac_models_E3_y mutant_type_E3_y
0 580 I S 2.152 1.7680 4.75 -1.2740 0.7972 6.375 1.0 ... 1.0 hydrophilic 0.8187 0.030890 6.0 0.1163 0.3235 5.571 1.0 hydrophilic
1 211 G F 1.957 0.5488 4.75 -0.9581 0.6185 5.125 1.0 ... 1.0 aromatic 1.1690 0.289000 5.0 0.1637 0.5716 5.714 1.0 aromatic
2 553 S W 1.945 0.2867 10.00 -0.2906 0.4253 13.250 1.0 ... 1.0 aromatic 0.9943 0.435200 9.5 0.1669 0.4060 9.857 1.0 aromatic
3 139 N W 1.930 0.4834 5.75 -0.9970 0.7668 8.125 1.0 ... 1.0 aromatic 1.0030 0.004575 4.5 -0.4441 0.4452 6.286 1.0 aromatic
4 306 N R 1.523 1.8470 3.75 -1.3350 0.5085 5.750 1.0 ... 1.0 positive 0.9003 0.713800 4.5 -0.8509 0.5830 6.286 1.0 positive

5 rows × 35 columns

Find sites with the largest absolute difference in binding¶

In [13]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
    df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
    print(
        "These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
    )
    display(df.sort_values(by="binding_diff", ascending=False).head(10))

    # calculate aggregate differences
    agg_df = (
        df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
        .mean()
        .reset_index()
    )
    print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
    display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))


find_highest_lowest(df_binding_effect_merge)
These are the mutants with the biggest difference between EFNB2 and EFNB3:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff
5682 530 Q F -4.015 0.46650 6.25 0.49310 0.2289 5.250 1.0 aromatic 0.084370 0.06198 6.0 0.3592 0.2738 6.286 1.0 aromatic 4.099370
5207 490 Q Y -4.290 0.90020 5.50 -0.87840 0.7489 6.375 1.0 aromatic -0.461000 0.57400 5.5 -0.3137 0.7967 5.714 1.0 aromatic 3.829000
5212 491 S H -4.505 0.93500 8.50 0.17270 0.3082 9.750 1.0 positive -0.871800 0.78980 7.0 -1.3810 0.4865 8.286 1.0 positive 3.633200
5211 491 S G -4.181 0.59490 6.50 -0.05616 0.6193 8.375 1.0 special -0.562100 0.04726 5.5 -0.2358 0.6084 6.286 1.0 special 3.618900
5206 490 Q W -4.205 1.00500 3.75 0.25840 0.3965 4.625 1.0 aromatic -0.586500 0.32260 3.0 -0.1506 0.4575 3.286 1.0 aromatic 3.618500
6465 588 I P -2.349 0.84050 5.25 -0.30780 0.2743 4.875 1.0 special 1.183000 0.25530 5.5 -0.6131 0.3819 5.429 1.0 special 3.532000
5231 492 Q K -3.523 0.54860 11.00 0.39690 0.2904 11.620 1.0 positive 0.006126 0.07740 10.0 0.3291 0.2404 11.570 1.0 positive 3.529126
5235 492 Q R -2.823 0.61110 19.25 -0.10820 0.2778 23.000 1.0 positive 0.643700 0.05388 20.0 0.5095 0.1192 20.570 1.0 positive 3.466700
1786 239 S H -3.440 0.08816 5.25 0.36800 0.3143 4.500 1.0 positive -0.041160 0.83220 4.0 -1.2190 0.6930 5.000 1.0 positive 3.398840
5691 530 Q V -4.327 0.83770 6.00 0.16180 0.3502 8.500 1.0 hydrophobic -0.987000 0.89600 5.5 -1.4430 0.7389 7.000 1.0 hydrophobic 3.340000
These are the sites with the biggest difference between EFNB2 and EFNB3:

site binding_mean_E2 binding_mean_E3 binding_diff
415 505 -3.928250 -0.770900 2.897100
439 530 -3.663994 -0.559266 2.797646
405 491 -3.603113 -0.364667 2.785117
495 588 -3.567800 0.037600 2.686600
404 490 -3.812286 -0.370980 2.497620

Make plots¶

Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶

In [14]:
def plot_corr_binding_entry_updated(df, flag):
    df = df.copy().round(2)
    # Define interactive selectors for variant selection.
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
    )
    # 'variant_selector_agg' is for aggregated selection based on 'site' only.
    variant_selector_agg = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )

    # Initialize an empty list to store charts for each unique selection in 'df'.
    empty_chart = []
    
    # Loop through each unique cell selection in the DataFrame.
    for cell in list(df["selection"].unique()):
        # Filter DataFrame for the current selection.
        tmp_df = df[df["selection"] == cell]
        
        # Check if aggregation flag is True to aggregate data.
        if flag == True:
            # Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
            agg_df = (
                tmp_df.groupby("site")[["binding_mean", "effect"]].mean().reset_index().round(2)
            )
            # Create a chart using aggregated data with specified encodings.
            chart = (
                alt.Chart(agg_df)
                .mark_point(stroke="black", filled=True)  # Use filled points with black stroke.
                .encode(
                    x=alt.X(
                        "effect",
                        title=f"Mean entry in CHO-{cell}",
                        axis=alt.Axis(grid=True),
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"Mean {cell} binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
                    opacity=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0.2)
                    ),
                    size=alt.condition(
                        variant_selector_agg, alt.value(100), alt.value(50)
                    ),
                    strokeWidth=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector_agg, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=["site", "binding_mean", "effect"],
                )
                .add_params(variant_selector_agg)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

        else:
            # Create a chart for individual data points with specified encodings.
            chart = (
                alt.Chart(tmp_df)
                .mark_point(stroke="black", filled=True)
                .encode(
                    x=alt.X(
                        "effect", title=f"Entry in CHO-{cell}", axis=alt.Axis(grid=True)
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"{cell} binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
                    opacity=alt.condition(
                        variant_selector, alt.value(1), alt.value(0.1)
                    ),
                    size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
                    strokeWidth=alt.condition(
                        variant_selector, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=[
                        "site",
                        "wildtype",
                        "mutant",
                        "binding_mean",
                        "times_seen_binding",
                        "effect",
                    ],
                )
                .add_params(variant_selector)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

    # Combine all charts horizontally with a title.
    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    )
    
    # Return the combined chart for display or further use.
    return combined_chart

# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()

# Save the plot 
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot.save(entry_binding_combined_corr_plot)

# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()

Same plot as above, but slightly different format¶

In [15]:
def plot_entry_binding_corr_heatmap(df):
    empty_chart = []
    for cell in list(df["selection"].unique()):
        tmp_df = df[df["selection"] == cell]
        chart = (
            alt.Chart(tmp_df, title=f"{cell}")
            .mark_rect()
            .encode(
                x=alt.X(
                    "effect", title="Cell entry", axis=alt.Axis(values=[-2, -1, 0, 1])
                ).bin(maxbins=50),
                y=alt.Y(
                    "binding_mean",
                    title="Receptor binding",
                    axis=alt.Axis(values=[-4, -2, 0, 2]),
                ).bin(maxbins=50),
                color=alt.Color("count()", title="Count").scale(type='log'),
            )
        ).properties(width=200,height=200)
        empty_chart.append(chart)

    combined_chart = alt.hconcat(
        *empty_chart, 
    ).resolve_scale(y="shared", x="shared", color="shared")
    return combined_chart


entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_heat.save(entry_binding_corr_heatmap)

Find correlations between bEFNB2 and bEFNB3 binding¶

In [16]:
def plot_entry_binding_corr(df):
    chart = (
        alt.Chart(df, title="Correlation Between Mutant Binding Scores")
        .mark_rect()
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title="bEFNB2 binding",
                axis=alt.Axis(values=[-4,-2, 0, 2]),
            ).bin(maxbins=50),
            y=alt.Y(
                "binding_mean_E3",
                title="bEFNB3 binding",
                axis=alt.Axis(values=[-2, 0, 2]),
            ).bin(maxbins=50),
            color=alt.Color("count()", title="Count").scale(type='log'),
        )
    ).properties(width=200,height=200)
    return chart


entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()

Plot correlations of binding mutants in scatterplots, and color different subsets¶

First, find mutations that are outliers in the correlation between bEFNB2 and bEFNB3 binding¶

In [17]:
def find_outliers(df):
    df = df.dropna().copy()
    #Calculate the best fit line
    slope, intercept = np.polyfit(df['binding_mean_E2'], df['binding_mean_E3'], 1)
    
    #compute residuals
    df['predicted_y'] = slope * df['binding_mean_E2'] + intercept
    df['residuals'] = df['binding_mean_E3'] - df['predicted_y']
    
    #identify outliers
    # calculate the mean and standard deviation of the residuals
    mean_residual = np.mean(df['residuals'])
    std_residual = np.std(df['residuals'])

    outlier_threshold = 4.5 #4.5 std deviations
    df['is_outlier'] = abs(df['residuals']) > (std_residual * outlier_threshold)
    
    # Filter the DataFrame to only include outliers
    outliers = df[df['is_outlier']]
    print(f'Here are the outlier mutations outside of a {outlier_threshold} standard deviation:\n')
    display(outliers)
    outliers_list = list(outliers['site'].unique())
    print(f' Here are the sites: \n {outliers_list}')
    return df,outliers_list

residuals_df,outliers_list = find_outliers(df_binding_effect_merge)
Here are the outlier mutations outside of a 4.5 standard deviation:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 ... times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff predicted_y residuals is_outlier
109 79 N S 1.2920 1.6520 3.00 -0.6820 0.5704 2.375 1.0 ... 3.0 -0.12940 0.5688 3.429 1.0 hydrophilic 1.93840 0.271046 -0.917446 True
555 137 S D -0.4362 0.8381 4.00 -1.2230 0.4498 7.000 1.0 ... 4.0 -0.41160 0.7602 3.571 1.0 negative 1.23990 -0.118953 0.922653 True
609 143 N Q 0.4306 0.6943 5.25 -0.1664 0.5468 8.250 1.0 ... 4.5 -0.35540 0.7932 4.286 1.0 hydrophilic 1.43260 0.076656 -1.078656 True
752 161 S E 0.5686 0.6514 2.50 -1.2950 0.6283 5.750 1.0 ... 3.0 -0.14880 0.4404 3.143 1.0 negative 0.54040 0.107798 1.001202 True
5232 492 Q L -0.2126 0.2587 5.50 0.4493 0.3064 4.625 1.0 ... 5.5 0.48330 0.1827 5.143 1.0 hydrophobic 1.47360 -0.068493 1.329493 True
5235 492 Q R -2.8230 0.6111 19.25 -0.1082 0.2778 23.000 1.0 ... 20.0 0.50950 0.1192 20.570 1.0 positive 3.46670 -0.657576 1.301276 True
5682 530 Q F -4.0150 0.4665 6.25 0.4931 0.2289 5.250 1.0 ... 6.0 0.35920 0.2738 6.286 1.0 aromatic 4.09937 -0.926572 1.010942 True
6032 555 D K -3.3210 0.6425 8.00 0.2361 0.3405 10.000 1.0 ... 6.0 0.08504 0.4221 6.714 1.0 positive 1.17400 -0.769958 -1.377042 True
6035 555 D R -2.4650 1.6330 3.00 0.3892 0.1980 4.500 1.0 ... 4.0 0.07591 0.5260 4.143 1.0 positive 0.95500 -0.576787 -0.933213 True
6321 580 I C 0.5076 0.9803 3.25 -1.4830 0.4867 8.000 1.0 ... 4.0 -0.31540 0.4011 4.857 1.0 special 0.73340 0.094033 1.146967 True
6465 588 I P -2.3490 0.8405 5.25 -0.3078 0.2743 4.875 1.0 ... 5.5 -0.61310 0.3819 5.429 1.0 special 3.53200 -0.550609 1.733609 True

11 rows × 23 columns

 Here are the sites: 
 [79, 137, 143, 161, 492, 530, 555, 580, 588]

Find mutations in the top quantile of binding for bEFNB2 and bEFNB3 both¶

In [18]:
def find_top_for_both(df):
    quantile_threshold = 0.99
    # Calculate the quantiles for both variables
    x_quantile = df['binding_mean_E2'].quantile(quantile_threshold)
    y_quantile = df['binding_mean_E3'].quantile(quantile_threshold)
    
    # Filter points that are above the quantile threshold
    df['meets_threshold'] = (df['binding_mean_E2'] >= x_quantile) & (df['binding_mean_E3'] >= y_quantile)
    subset = df.query('meets_threshold == True')
    top_mutants = list(subset['site'].unique())
    return df, top_mutants

top_residuals_df,top_mutants_list = find_top_for_both(residuals_df)
print(f' The sites with the top binding mutations are : \n {top_mutants_list}')
cleaned_df = top_residuals_df.query('meets_threshold == True')[['wildtype','site','mutant','binding_mean_E2','binding_mean_E3']].round(2)
print(f'Here are the specific mutations:\n')
display(cleaned_df)
 The sites with the top binding mutations are : 
 [104, 139, 211, 268, 306, 508, 553, 580, 589]
Here are the specific mutations:

wildtype site mutant binding_mean_E2 binding_mean_E3
386 E 104 T 1.93 0.65
582 N 139 W 1.93 1.00
1433 G 211 F 1.96 1.17
2015 T 268 W 1.52 0.77
2561 N 306 M 1.49 0.66
2564 N 306 R 1.52 0.90
2566 N 306 T 1.39 0.72
5398 Y 508 Q 1.93 0.64
6007 S 553 W 1.94 0.99
6331 I 580 S 2.15 0.82
6480 R 589 V 1.67 0.73

Plot correlations between individual mutational effects on binding, and color different subsets of sites¶

In [19]:
def plot_affinity_BLI_mutants(df, highlight_conditions, color,subset=None):
    df = df.dropna().round(2).copy()
    df['site_mutant'] = df['site'].astype(str) + df['mutant'].astype(str)

    if subset is not None:
        df = df[df['site'].isin(subset)]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["binding_mean_E2"], df["binding_mean_E3"]
    )
    
    # make correlation chart
    chart = alt.Chart(df,title=alt.Title("Correlation Between Mutant Binding Scores",subtitle=f'r = {r_value:.2f}')
    ).mark_point(
        color="gray", 
        size=30, 
        opacity=0.4, 
        filled=True
    ).encode(
        x=alt.X("binding_mean_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4)),
        y=alt.Y("binding_mean_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4)),
        tooltip=[
            "site",
            "wildtype",
            "mutant",
            "binding_mean_E2",
            "binding_mean_E3",
            "effect_E2",
            "effect_E3",
            "binding_std_E2",
            "binding_std_E3",
            "times_seen_binding_E2",
            "times_seen_binding_E3"
        ],
    )

    #make colored circles for specific data
    highlight = chart.transform_filter(highlight_conditions).mark_point(
        color=color, size=60, opacity=1, filled=True, stroke='black', strokeWidth=1
    )
    #write text near the orange circles
    text_on_point = chart.transform_filter(highlight_conditions).mark_text(
            align='center',baseline='top',dy=-20,fontSize=16
    ).encode(text='site_mutant')
    
    min = int(df["binding_mean_E2"].min())
    max = int(df["binding_mean_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=-10, dy=-20)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart + text_on_point + highlight #+ text
    return chart_and_text.properties(height=300,width=300)

Sites selected for BLI validation¶

In [20]:
# these are the data we want to show in orange circles
highlight_conditions = (
        (alt.datum.site == 244) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 305) & (alt.datum.mutant == 'W') | 
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'Y') |
        (alt.datum.site == 559) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'V') 
)

E2_E3_corr_BLI_mutants = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions,'#e49444')
E2_E3_corr_BLI_mutants.display()

Sites selected for neutralization validations¶

In [21]:
highlight_conditions_neut = (
        (alt.datum.site == 333) & (alt.datum.mutant == 'Q') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') | 
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'K') 
)
neut_muts = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_neut,'#5778a4')
neut_muts.display()

Find outliers¶

In [22]:
highlight_conditions = (
        (alt.datum.is_outlier == True)
)
E2_E3_corr = plot_affinity_BLI_mutants(residuals_df,highlight_conditions,'#d1615d')
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_corr.save(E2_E3_correlation)

Find top mutations for both bEFNB2 and bEFNB3¶

In [23]:
highlight_conditions = (
        (alt.datum.meets_threshold == True)
)
E2_E3_corr = plot_affinity_BLI_mutants(top_residuals_df.query('meets_threshold == True'),highlight_conditions,'#d1615d')
E2_E3_corr.display()

Plot correlations of mutants for individual sites of interest with letters of each mutant plotted¶

In [24]:
def plot_affinity_individual_mutants(df,mutant):
    df = df.dropna().round(2).copy()
    df = df[df['site'] == mutant]
    if df.empty:
        print('nothing here')
        pass
    else:
        wildtype_site = (df['wildtype'].astype(str) + df['site'].astype(str)).unique()[0]

        chart = alt.Chart(df,title=alt.Title(f"Site {wildtype_site}")
        ).mark_text(
            size=15
        ).encode(
            x=alt.X("binding_mean_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=3)),
            y=alt.Y("binding_mean_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=3)),
            text=alt.Text('mutant'),
            color=alt.Color('mutant_type_E2',title='Amino acid type'),
            tooltip=[
                "site",
                "wildtype",
                "mutant",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    
        return chart.properties(height=200,width=200)

Plot amino acid letter correlation plots for the top sites¶

In [25]:
empty_charts = []
for site in top_mutants_list:
    indiv_graph = plot_affinity_individual_mutants(df_binding_effect_merge,site)
    empty_charts.append(indiv_graph)

top_sites = alt.vconcat(*empty_charts).resolve_scale(x='shared',y='shared',color='shared')
top_sites.display()

Plot amino acid letter correlation plots for the outlier sites¶

In [26]:
empty_charts = []
for site in outliers_list:
    if site < 180:
        pass
    else:
        indiv_graph = plot_affinity_individual_mutants(df_binding_effect_merge,site)
        empty_charts.append(indiv_graph)

outlier_sites = alt.vconcat(*empty_charts).resolve_scale(x='shared',y='shared',color='shared')
outlier_sites.display()

Now make for all contact sites¶

In [27]:
empty_charts = []
for site in config['contact_sites']:
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        contact_plots = plot_affinity_individual_mutants(tmp_df,site)
        empty_charts.append(contact_plots)
    else:
        pass

all_contact_plots = alt.vconcat(*empty_charts).resolve_scale(x='shared', y='shared')
all_contact_plots.display()

Make correlation amino acid letter plots for 580-590 loop¶

In [28]:
for site in list(range(580,590)):
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        test_plots = plot_affinity_individual_mutants(tmp_df,site)
        test_plots.display()
    else:
        pass

Plot correlations between each site for mean value¶

In [29]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = (
        df.groupby("site")
        .agg(
            {
                "effect_E2": "mean",
                "effect_E3": "mean",
                "binding_mean_E2": "mean",
                "binding_mean_E3": "mean",
                "wildtype": "first",
            }
        )
        .reset_index().round(2)
    )
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["binding_mean_E2"], means["binding_mean_E3"]
    )
    r_value = float(r_value)
    chart = (
        alt.Chart(
            means,
            title=alt.Title(
                "Correlation between Aggregate Mutant Binding Scores",
                subtitle=f"r={r_value:.2f}",
            ),
        )
        .mark_point(size=50, opacity=1,filled=True,color='black')
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title=("Mean bEFNB2 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            y=alt.Y(
                "binding_mean_E3",
                title=("Mean bEFNB3 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            tooltip=[
                "site",
                "wildtype",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    )
    text = (
        alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=0, dy=0)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart.properties(width=200,height=200)
    return chart_and_text


E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_site_corr.save(E2_E3_correlation_site)

Make plot showing binding by site¶

In [30]:
def plot_affinity_by_site(df):
     # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "binding_mean": row["binding_mean"], "site": row["site"],"selection": row["selection"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(2)
    
    # Setup interactivity
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )
    #make chart
    chart = alt.Chart(agg_means_df).mark_bar(stroke='black',size=1.5,binSpacing=0,color='black').encode(
        alt.X("site")
            .title("Site")
            .axis(tickCount=5,labelAngle=-90,grid=True),
            #.scale(domain=[70, 602]),
        
        alt.Y('mean(binding_mean)')
            .axis(tickCount=3)
            .title('Mean binding'),
        alt.Row('selection',title=None),
        tooltip=['site'],
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        color=alt.Color('region',sort=custom_order,title='Region',scale=alt.Scale(range=['#5778a4', '#e49444', '#d1615d', '#85b6b2'])),
        #strokeColor=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),


    ).properties(height=150, width=800).add_params(variant_selector).resolve_scale(y='independent')
    

    return chart


binding_by_site = plot_affinity_by_site(df_binding_effect_concat)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    binding_by_site.save(binding_by_site_plot)

Make bar chart showing max binding score for each contact residue¶

In [31]:
def plot_affinity_by_contact_site(df, sites_to_show):
    # Filter the DataFrame based on the sites to show
    contact_df = df[df["site"].isin(sites_to_show)]
    
    # Define a selection for highlighting bars on hover
    variant_selector = alt.selection_point(on="mouseover", nearest=True, empty=False, fields=["site"])
    
    # Create the chart
    chart = alt.Chart(contact_df).mark_bar(stroke='black').encode(
        y=alt.Y("site:N", title="Site",sort='-x'),
        x=alt.X("max(binding_mean):Q", title="Max binding mutant at site"),
        color=alt.condition(variant_selector, alt.value("orange"), alt.value("black")),
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        column=alt.Column('selection',title=None)
    ).add_params(variant_selector).properties(width=200, height=alt.Step(12)).resolve_scale(x='independent',y='shared').configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=3
    )
    
    return chart
In [32]:
contact_binding_by_site = plot_affinity_by_contact_site(df_binding_effect_concat, config["contact_sites"])
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    contact_binding_by_site.save(combined_contact_ranked_bar_output)

Make bubble plots for binding in different areas of receptor pocket¶

In [33]:
def make_bubble_binding_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
        "Receptor Contact": config["contact_sites"],
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact"]
    agg_means = []

    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {
                    "region": barrel,
                    "binding_mean": row["binding_mean"],
                    "site": row["site"],
                    "mutant": row["mutant"],
                    "selection": row["selection"],
                }
            )
        agg_means_df = pd.DataFrame(agg_means)

    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
    )

    chart = alt.Chart(agg_means_df).mark_point(stroke="black",filled=True).encode(
        x=alt.X(
            "region:O",
            sort=custom_order,
            title="RBP Region",
            axis=alt.Axis(labelAngle=-90),
        ),
        y=alt.Y(
            "binding_mean",
            title="Binding",
            axis=alt.Axis(tickCount=4),
        ),
        xOffset="random:Q",
        tooltip=["region", "binding_mean", "site", "mutant"],
        color=alt.condition(
            variant_selector, alt.value("orange"), alt.value("black")
        ),
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
        strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
        size=alt.condition(variant_selector, alt.value(70), alt.value(20)),
        column=alt.Column('selection',title=None)
    ).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())"
    ).properties(
        width=300, 
        height=300
    ).add_params(variant_selector
    ).resolve_scale(y='independent'
    ).configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=1
    )
    
    return chart


entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
    entry_region_bubble.save(binding_region_bubble_plot)

Plot effects of selected mutants on cell entry¶

In [34]:
def plot_effects_of_mutants(df):
    tmp_df = df.copy()
    tmp_df = tmp_df[
        #((tmp_df['site'] == 580) & (tmp_df['mutant'] == 'S')) |
        ((tmp_df['site'] == 598) & (tmp_df['mutant'] == 'G')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'L')) |
        ((tmp_df['site'] == 553) & (tmp_df['mutant'] == 'W')) |
        #((tmp_df['site'] == 588) & (tmp_df['mutant'] == 'P')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'R')) |
        #((tmp_df['site'] == 530) & (tmp_df['mutant'] == 'F')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'K')) |
        #((tmp_df['site'] == 239) & (tmp_df['mutant'] == 'H')) |
        ((tmp_df['site'] == 211) & (tmp_df['mutant'] == 'F')) |
        ((tmp_df['site'] == 546) & (tmp_df['mutant'] == 'H')) |
        ((tmp_df['site'] == 143) & (tmp_df['mutant'] == 'Q')) |
        ((tmp_df['site'] == 331) & (tmp_df['mutant'] == 'E')) |
        ((tmp_df['site'] == 566) & (tmp_df['mutant'] == 'C')) 

        #((tmp_df['site'] == 589) & (tmp_df['mutant'] == 'G')) 
    ]
    tmp_df['label'] = (tmp_df['wildtype'].astype(str) + tmp_df['site'].astype(str) + tmp_df['mutant'].astype(str))

    tmp_df = tmp_df.sort_values(by='site')
    binding_chart = alt.Chart(tmp_df).mark_bar().encode(
        alt.X("label:N",title='Mutant',sort=None),
        alt.Y("binding_mean:Q",title='Binding score',axis=alt.Axis(tickCount=4)),
        xOffset="selection:N",
        color=alt.Color("selection:N",title='Receptor selection'),
        #row='selection'
    )
    binding_chart.resolve_scale(y='independent',x='shared').display()
plot_effects_of_mutants(df_binding_effect_concat)
In [ ]: